import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob
from itertools import chain
from random import sample
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score, confusion_matrix, average_precision_score, accuracy_score
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.applications.densenet import DenseNet121
%matplotlib inline
plt.rcParams.update({'font.size': 16})
# Loading the NIH data into dataframe together with full image filepaths for easier manipulation
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in
glob(os.path.join('/data','images*', '*', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(3)
# Splitting the "Finding Labels" columns to have a binary column for each finding
finding_labels = np.unique(list(chain(*all_xray_df['Finding Labels'].apply(lambda x: x.split('|')).tolist())))
for label in finding_labels:
all_xray_df[label] = all_xray_df['Finding Labels'].apply(lambda x: 1.0 if label in x else 0)
# Creating a new column called 'pneumonia_class' for binary classification
all_xray_df['pneumonia_class']=all_xray_df['Pneumonia'].replace({0.0:'Negative',1.0:'Positive'})
# Moving the index to a separate column 'Scan_ID' for later manipulation during train/validation/test splits
all_xray_df['Scan_ID'] = all_xray_df.index
all_xray_df.head()
Goals:
# Splitting the data into two portions:
# 1) 80% training and validation datasets
# 2) 20% test dataset
# in such a way that there is no Patient ID overlap between the sets
train_valid_inds, test_inds = next(GroupShuffleSplit(test_size=0.2,
n_splits=1,
random_state = 16).split(all_xray_df, groups=all_xray_df['Patient ID']))
train_valid_df = all_xray_df.loc[train_valid_inds]
test_df = all_xray_df.loc[test_inds]
# Resetting index for further split
train_valid_df.reset_index(inplace=True)
train_valid_df.head()
# Splitting the train_valid dataset into two portions:
# 1) 90% training dataset
# 2) 10% validation dataset
# in such a way that there is no Patient ID overlap between the sets
train_inds, valid_inds = next(GroupShuffleSplit(test_size=0.1,
n_splits=1,
random_state = 16).split(train_valid_df, groups=train_valid_df['Patient ID']))
train_df = train_valid_df.loc[train_inds]
valid_df = train_valid_df.loc[valid_inds]
# Confirming that there is no Scan_ID overlap between the datasets
train_scan_ids = train_df['Scan_ID'].to_numpy()
valid_scan_ids = valid_df['Scan_ID'].to_numpy()
test_scan_ids = test_df['Scan_ID'].to_numpy()
print(f'Scan ID overlap between train and valid datasets: {np.intersect1d(train_scan_ids, valid_scan_ids)}')
print(f'Scan ID overlap between train and test datasets: {np.intersect1d(train_scan_ids, valid_scan_ids)}')
print(f'Scan ID overlap between valid and test datasets: {np.intersect1d(valid_scan_ids, test_scan_ids)}')
# Confirming that there is no Patient_ID overlap between the datasets
train_patient_ids = train_df['Patient ID'].to_numpy()
valid_patient_ids = valid_df['Patient ID'].to_numpy()
test_patient_ids = test_df['Patient ID'].to_numpy()
print(f'Patient ID overlap between train and valid datasets: {np.intersect1d(np.unique(train_patient_ids), np.unique(valid_patient_ids))}')
print(f'Patient ID overlap between train and test datasets: {np.intersect1d(np.unique(train_patient_ids), np.unique(test_patient_ids))}')
print(f'Patient ID overlap between valid and test datasets: {np.intersect1d(np.unique(valid_patient_ids), np.unique(test_patient_ids))}')
# Checking how much pneumonia cases are there in the training and in the validation datasets
train_df_pneumonia_cases = len(train_df[train_df.Pneumonia==1])
valid_df_pneumonia_cases = len(valid_df[valid_df.Pneumonia==1])
test_df_pneumonia_cases = len(test_df[test_df.Pneumonia==1])
print(f'Pneumonia cases in the training set: {train_df_pneumonia_cases}.')
print(f'Pneumonia cases in the validation set: {valid_df_pneumonia_cases}.')
print(f'Pneumonia cases in the test set: {test_df_pneumonia_cases}.')
print(f'Percentage of pneumonia cases, which were assigned to the training set: {train_df_pneumonia_cases/(train_df_pneumonia_cases + valid_df_pneumonia_cases + test_df_pneumonia_cases)*100:.3}%')
print(f'Percentage of pneumonia cases, which were assigned to the validation set: {valid_df_pneumonia_cases/(train_df_pneumonia_cases + valid_df_pneumonia_cases + test_df_pneumonia_cases)*100:.3}%')
print(f'Percentage of pneumonia cases, which were assigned to the test set: {test_df_pneumonia_cases/(train_df_pneumonia_cases + valid_df_pneumonia_cases + test_df_pneumonia_cases)*100:.3}%')
# Sampling randomly negative cases, such that the traning dataset consist of 50% positive cases.
train_scan_ids_pos = train_df[train_df.Pneumonia==1].index.to_numpy()
train_scan_ids_neg = train_df[train_df.Pneumonia==0].index.to_numpy()
train_scan_ids_neg_sample = np.random.choice(train_scan_ids_neg, size=train_df_pneumonia_cases, replace=False)
train_scan_ids_pos_neg_sample = np.concatenate((train_scan_ids_pos, train_scan_ids_neg_sample))
train_df = train_df.loc[train_scan_ids_pos_neg_sample]
train_df = train_df.sample(frac=1)
train_df.head()
# Sampling randomly negative cases, such that the validation dataset consist of 50% positive cases.
valid_scan_ids_pos = valid_df[valid_df.Pneumonia==1].index.to_numpy()
valid_scan_ids_neg = valid_df[valid_df.Pneumonia==0].index.to_numpy()
valid_scan_ids_neg_sample = np.random.choice(valid_scan_ids_neg, size=valid_df_pneumonia_cases, replace=False)
valid_scan_ids_pos_neg_sample = np.concatenate((valid_scan_ids_pos, valid_scan_ids_neg_sample))
valid_df = valid_df.loc[valid_scan_ids_pos_neg_sample]
valid_df = valid_df.sample(frac=1)
# Sampling randomly negative cases, such that the test dataset consist of 20% positive cases.
test_scan_ids_pos = test_df[test_df.Pneumonia==1].index.to_numpy()
test_scan_ids_neg = test_df[test_df.Pneumonia==0].index.to_numpy()
test_scan_ids_neg_sample = np.random.choice(test_scan_ids_neg, size=4*test_df_pneumonia_cases, replace=False)
test_scan_ids_pos_neg_sample = np.concatenate((test_scan_ids_pos, test_scan_ids_neg_sample))
test_df = test_df.loc[test_scan_ids_pos_neg_sample]
test_df = test_df.sample(frac=1)
# Determination of the number of scans in each dataset
print(f'Number of scans in the training set: {len(train_df)}.')
print(f'Number of scans in the validation set: {len(valid_df)}.')
print(f'Number of scans in the test set: {len(test_df)}.')
# Distribution of findings other than pneumonia that are present in the scans
finding_labels_no_pneumonia = np.delete(finding_labels, [10,13])
#deleting "no finding" and "pneumonia" label
finding_labels_no_pneumonia_occurance_train = train_df.loc[:,finding_labels_no_pneumonia].sum()/len(train_df.loc[:,finding_labels_no_pneumonia])
finding_labels_no_pneumonia_occurance_valid = valid_df.loc[:,finding_labels_no_pneumonia].sum()/len(valid_df.loc[:,finding_labels_no_pneumonia])
finding_labels_no_pneumonia_occurance_test = test_df.loc[:,finding_labels_no_pneumonia].sum()/len(test_df.loc[:,finding_labels_no_pneumonia])
fig, ax = plt.subplots(3,1, figsize=(15,20))
ax[0].bar(finding_labels_no_pneumonia, finding_labels_no_pneumonia_occurance_train, color='blue')
ax[0].set_title("Training set: Distribution of findings other than pneumonia")
ax[0].set_ylabel('% of scans')
ax[0].set_xticklabels(finding_labels_no_pneumonia, rotation=90)
ax[1].bar(finding_labels_no_pneumonia, finding_labels_no_pneumonia_occurance_valid, color='red')
ax[1].set_title("Validation set: Distribution of findings other than pneumonia")
ax[1].set_ylabel('% of scans')
ax[1].set_xticklabels(finding_labels_no_pneumonia, rotation=90)
ax[2].bar(finding_labels_no_pneumonia, finding_labels_no_pneumonia_occurance_test, color='green')
ax[2].set_title("Test set: Distribution of findings other than pneumonia")
ax[2].set_ylabel('% of scans')
ax[2].set_xticklabels(finding_labels_no_pneumonia, rotation=90)
fig.tight_layout()
# Distribution of co-ocurrent findings together with pneumonia
fig, ax = plt.subplots(1,3, figsize=(20,6))
fig.suptitle("Pneumonia and most common coexisting findings", fontsize=22, y = 1, x = 0.52)
train_df_coocurrent = train_df[train_df.Pneumonia==1]['Finding Labels'].value_counts()[0:5]
valid_df_coocurrent = valid_df[valid_df.Pneumonia==1]['Finding Labels'].value_counts()[0:5]
test_df_coocurrent = test_df[test_df.Pneumonia==1]['Finding Labels'].value_counts()[0:5]
ax[0].bar(train_df_coocurrent.index, train_df_coocurrent, color='blue')
ax[0].set_title("Training dataset")
ax[0].set_xticklabels(train_df[train_df.Pneumonia==1]['Finding Labels'][0:5], rotation=90)
ax[0].set_ylabel('# of scans')
ax[1].bar(valid_df_coocurrent.index, valid_df_coocurrent, color='red')
ax[1].set_title("Validation dataset")
ax[1].set_xticklabels(valid_df[valid_df.Pneumonia==1]['Finding Labels'][0:5], rotation=90)
ax[2].bar(test_df_coocurrent.index, test_df_coocurrent, color='green')
ax[2].set_title("Test dataset")
ax[2].set_xticklabels(test_df[test_df.Pneumonia==1]['Finding Labels'][0:5], rotation=90)
plt.show()
Coocurent findings:
Coocurence with Edema, Hernia and Fibrosis:
During in-depth evaluation of the best model, possible cross-talk existance between pneumonia and the most common coocurent findings has to be investigated.
# Distribution of age
fig, ax = plt.subplots(1,3, figsize=(15,6))
fig.suptitle("Patients' age distribution", fontsize=22, y = 1, x = 0.52)
ax[0].hist(train_df['Patient Age'], bins=50, range=(0,100), color='blue')
ax[0].set_title("Training data set")
ax[0].set_xlabel('Age')
ax[0].set_ylabel('# of X-ray scans')
ax[1].hist(valid_df['Patient Age'], bins=50, range=(0,100), color='red')
ax[1].set_title("Validation dataset")
ax[1].set_xlabel('Age')
ax[2].hist(test_df['Patient Age'], bins=50, range=(0,100), color='green')
ax[2].set_title("Test dataset")
ax[2].set_xlabel('Age')
fig.tight_layout(pad=2.0)
plt.show()
# Distribution of patients' gender
train_df_gender = train_df['Patient Gender'].value_counts()
valid_df_gender = valid_df['Patient Gender'].value_counts()
test_df_gender = test_df['Patient Gender'].value_counts()
fig, ax = plt.subplots(1,3, figsize=(15,6))
fig.suptitle("Patients' gender distribution", fontsize=22, y = 1, x = 0.52)
ax[0].bar(train_df_gender.index, train_df_gender, color='blue')
ax[0].set_title("Training dataset")
ax[0].set_xlabel('Gender')
ax[0].set_ylabel('# of X-ray scans')
ax[1].bar(valid_df_gender.index, valid_df_gender, color='red')
ax[1].set_title("Validation dataset")
ax[1].set_xlabel('Gender')
ax[2].bar(test_df_gender.index, test_df_gender, color='green')
ax[2].set_title("Test dataset")
ax[2].set_xlabel('Gender')
fig.tight_layout(pad=2.0)
plt.show()
M_F_ratio_train = train_df['Patient Gender'].value_counts()[0] / train_df['Patient Gender'].value_counts()[1]
print(f'M/F ratio: Training dataset {M_F_ratio_train:.2}.')
M_F_ratio_valid = valid_df['Patient Gender'].value_counts()[0] / valid_df['Patient Gender'].value_counts()[1]
print(f'M/F ratio: Validation dataset {M_F_ratio_valid:.2}.')
M_F_ratio_test = test_df['Patient Gender'].value_counts()[0] / test_df['Patient Gender'].value_counts()[1]
print(f'M/F ratio: Test dataset {M_F_ratio_test:.2}.')
# Distribution of view position in the train and the validaton set
train_df_view = train_df['View Position'].value_counts()
valid_df_view = valid_df['View Position'].value_counts()
test_df_view = test_df['View Position'].value_counts()
fig, ax = plt.subplots(1,3, figsize=(15,6))
fig.suptitle("View positions", fontsize=22, y = 1, x = 0.52)
ax[0].bar(train_df_view.index, train_df_view, color='blue')
ax[0].set_title("Training dataset")
ax[0].set_xlabel('View position')
ax[0].set_ylabel('# of X-ray scans')
ax[1].bar(valid_df_view.index, valid_df_view, color='red')
ax[1].set_title("Validation dataset")
ax[1].set_xlabel('View position')
ax[2].bar(test_df_view.index, test_df_view, color='green')
ax[2].set_title("Test dataset")
ax[2].set_xlabel('View position')
fig.tight_layout(pad=2.0)
plt.show()
PA_AP_ratio_train = train_df['View Position'].value_counts()[0] / train_df['View Position'].value_counts()[1]
print(f'PA/AP ratio: Training dataset {PA_AP_ratio_train:.2}.')
PA_AP_ratio_valid = valid_df['View Position'].value_counts()[0] / valid_df['View Position'].value_counts()[1]
print(f'PA/AP ratio: Validation dataset {PA_AP_ratio_valid:.2}.')
PA_AP_ratio_test = test_df['View Position'].value_counts()[0] / test_df['View Position'].value_counts()[1]
print(f'PA/AP ratio: Test dataset {PA_AP_ratio_test:.2}.')
PA projection:
AP projection:
Differences between PA and AP views:
Ref: https://www.radiologymasterclass.co.uk/tutorials/chest/chest_quality/chest_xray_quality_projection
# Distribution of number of scans per patient in the train and the validaton set
fig, ax = plt.subplots(1,3, figsize=(15,6))
fig.suptitle("Numer of scans per patient distribution", fontsize=22, y = 1, x = 0.52)
ax[0].hist(train_df['Patient ID'].value_counts(), bins=20, log=True, color='blue')
ax[0].set_title("Train dataset")
ax[0].set_xlabel('# of scans')
ax[0].set_ylabel('# of patients')
ax[1].hist(valid_df['Patient ID'].value_counts(), bins=20, log=True, color='red')
ax[1].set_title("Validation dataset")
ax[1].set_xlabel('# of scans')
ax[2].hist(test_df['Patient ID'].value_counts(), bins=20, log=True, color='green')
ax[2].set_title("Test dataset:")
ax[2].set_xlabel('# of scans')
fig.tight_layout(pad=2.0)
plt.show()
# Saving train_df, valid_df and test_df
train_df.to_pickle('train_df.pkl')
valid_df.to_pickle('valid_df.pkl')
test_df.to_pickle('test_df.pkl')
# Loading train_df, valid_df and test_df
train_df = pd.read_pickle('train_df.pkl')
valid_df = pd.read_pickle('valid_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
def image_generator_train(samplewise_center=True, samplewise_std_normalization=True,
height_shift_range=0.1, width_shift_range=0.1,
rotation_range=2, zoom_range=0.01):
"""
Return training image data generator with augumented images
Args:
samplewise_center (Boolean): Set each sample mean to 0.
samplewise_std_normalization (Boolean): Divide each input by its std.
height_shift_range (Float): Fraction of total height.
width_shift_range (Float): Fraction of total width.
rotation_range (Int): Degree range for random rotations.
zoom_range (Float or [lower, upper]): Range for random zoom.
Returns:
train_idg (ImageDataGenrator): image generator for training set
"""
train_idg = ImageDataGenerator(samplewise_center = samplewise_center,
samplewise_std_normalization = samplewise_std_normalization,
height_shift_range = height_shift_range,
width_shift_range = width_shift_range,
rotation_range = rotation_range,
zoom_range = zoom_range)
return train_idg
def get_generator_train(train_idg, train_df, directory=None, x_col='path', y_col='pneumonia_class',
class_mode='binary', target_size=(224, 224), batch_size=64, shuffle=True, seed=16):
"""
Return generator for the training set, reading the image paths to load from dataframe
Args:
train_idg (ImageDataGenerator) image data generator for the training set.
train_df (dataframe): dataframe containing training data.
directory (str): directory, in which image files are held.
x_col (str): name of column in dataframe that holds filenames.
y_col (str): name of column in dataframe that holds filenames the label for images.
class_mode (str): one of "binary", "categorical", "input", "multi_output", "raw", sparse" or None.
target_size (tuple of integers (height, width)): default: The dimensions to which all images found will be resized.
batch_size (int): images per batch to be fed into model during training.
shuffle (Boolean): whether to shuffle the data
seed (int): random seed.
Returns:
train_gen (DataFrameIterator): iterator over training set
"""
train_gen = train_idg.flow_from_dataframe(dataframe = train_df,
directory=directory,
x_col = x_col,
y_col = y_col,
class_mode = class_mode,
target_size = target_size,
shuffle = shuffle,
seed = seed,
batch_size = batch_size)
return train_gen
def get_test_valid_generator(samplewise_center=True, samplewise_std_normalization=True, train_df=train_df, valid_df=valid_df,
test_df=test_df, directory=None, x_col='path', y_col='pneumonia_class', class_mode='binary',
target_size=(224, 224), sample_size = 100, batch_size=64, shuffle=True, seed=16):
"""
Return generator for validation set and test test set using
statistics from training set.
Args:
samplewise_center (Boolean): Set each sample mean to 0.
samplewise_std_normalization (Boolean): Divide each input by its std.
train_df (dataframe): dataframe containing training data.
valid_df (dataframe): dataframe containing validation data.
test_df (dataframe): dataframe containing test data.
directory (str): directory, in which image files are held.
x_col (str): name of column in dataframe that holds filenames.
y_col (str): name of column in dataframe that holds filenames the label for images.
class_mode (str): one of "binary", "categorical", "input", "multi_output", "raw", sparse" or None.
target_size (tuple of integers (height, width)): default: The dimensions to which all images found will be resized.
sample_size (int): size of sample to use for normalization statistics.
batch_size (int): images per batch to be fed into model during training.
shuffle (Boolean): whether to shuffle the data
seed (int): random seed.
Returns:
valid_generator and test_generator (DataFrameIterator): iterators over validation and test datsets respectively
"""
# get generator for sample dataset from training set
raw_train_generator = ImageDataGenerator().flow_from_dataframe(
dataframe = train_df,
directory = directory,
x_col = x_col,
y_col = y_col,
class_mode = class_mode,
target_size = target_size,
shuffle = shuffle,
seed = seed,
batch_size = sample_size)
# get one batch of sample data
batch = raw_train_generator.next()
data_sample = batch[0]
# get mean and std from sample data to normalize images in test set generator
image_generator = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization= True)
image_generator.fit(data_sample)
print(f'Mean of {sample_size} images from the training set: {np.mean(data_sample):.4}')
print(f'Std of {sample_size} images from the training set: {np.std(data_sample):.4}')
np.save('Training_sample_mean', np.mean(data_sample))
np.save('Training_sample_std', np.std(data_sample))
# get validation generator
valid_gen = image_generator.flow_from_dataframe(dataframe = valid_df,
directory = directory,
x_col = x_col,
y_col = y_col,
class_mode = class_mode,
target_size = target_size,
batch_size = batch_size,
shuffle = False,
seed = seed)
# get test generator
test_gen = image_generator.flow_from_dataframe(dataframe = test_df,
directory = directory,
x_col = x_col,
y_col = y_col,
class_mode = class_mode,
target_size = target_size,
batch_size = batch_size,
shuffle = False,
seed = seed)
return valid_gen, test_gen
# The models in this notebook will be based on DenseNet121 pretrained model with imagenet weights
model = DenseNet121(include_top=True, weights='imagenet')
model.summary()
def load_pretrained_model(pretrained_model=DenseNet121, weights='imagenet', transfer_layer='conv5_block16_1_conv',
transfer_layer_idx=420):
"""
Return pre-trained model with defined weights and defined transfer layer.
Args:
pretrained model (Model): the pretrained model.
weights (str): weights, which will be loaded.
transfer_layer (str): name of the layer from which the weights will be learned during training.
transfer_layer_idx (int): index of the previously specified transfer layer.
Returns:
pretrained model (Model): the pretrained model.
"""
model = pretrained_model(include_top=True, weights=weights)
transfer_layer = model.get_layer(transfer_layer)
pretrained_model = Model(inputs = model.input, outputs = model.output)
# freezing layers till transfer_layer
for layer in pretrained_model.layers[0:transfer_layer_idx]:
layer.trainable = False
# rechecking, if the freezeing was performed correctly
print('Pre-trained model layers and their trainability')
for layer in pretrained_model.layers:
print(layer.name, layer.trainable)
return pretrained_model
def build_model(pretrained_model, lr=0.0001, loss = 'binary_crossentropy', metrics = ['binary_accuracy'], dropout=0.5):
"""
Function builing a model by attaching layers after the pre-trained model.
Args:
pretrained model (Model): the pretrained model.
lr (Float): learning rate.
loss (str): loss function.
metrics (List): metrics to be monitored during training.
dropout (Float): fraction of neurons randomly swiching off during training.
Returns:
model (Model): model ready for training
"""
model = Sequential()
model.add(pretrained_model)
model.add(Dense(500, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(100, activation='relu'))
model.add(Dropout(dropout))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=lr)
loss = loss
metrics = metrics
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
return model
def build_simpler_model(pretrained_model, lr=0.0001, loss = 'binary_crossentropy', metrics = ['binary_accuracy']):
"""
Function builing a model by attaching single output neuron after the pre-trained model.
Args:
pretrained model (Model): the pretrained model.
lr (Float): learning rate.
loss (str): loss function.
metrics (List): metrics to be monitored during training.
dropout (Float): fraction of neurons randomly swiching off during training.
Returns:
model (Model): model ready for training
"""
model = Sequential()
model.add(pretrained_model)
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=lr)
loss = loss
metrics = metrics
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
return model
def build_model_no_dropout(pretrained_model, lr=0.0001, loss = 'binary_crossentropy', metrics = ['binary_accuracy']):
"""
Function builing a model by attaching layers after the pre-trained model without dropout layers.
Args:
pretrained model (Model): the pretrained model.
lr (Float): learning rate.
loss (str): loss function.
metrics (List): metrics to be monitored during training.
Returns:
model (Model): model ready for training
"""
model = Sequential()
model.add(pretrained_model)
model.add(Dense(500, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(lr=lr)
loss = loss
metrics = metrics
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
return model
def save_model(model, model_name):
"""
Function saving a model to .json file.
Args:
model (Model): model to be saved.
model_name (str): name under which the model will be saved.
"""
model_json = model.to_json()
filename = str(model_name) + '.json'
with open(filename, 'w') as json_file:
json_file.write(model_json)
print(f'Saved model architecture under {filename}')
def plot_history(history_df):
"""
Function plotting the training history.
Args:
history_df (dataframe): dataframe containing training history
"""
N = len(history_df)
fig, ax = plt.subplots(1,2, figsize=(15,6))
ax[0].plot(np.arange(0, N), history_df["loss"], label="train loss", color='blue')
ax[0].plot(np.arange(0, N), history_df["val_loss"], label="valid loss", color='red')
ax[0].set_title("Training and validation loss")
ax[0].set_xlabel("# of epoch")
ax[0].set_ylabel("Loss")
ax[0].legend()
ax[1].plot(np.arange(0, N), history_df["binary_accuracy"], label="train acc", color='blue')
ax[1].plot(np.arange(0, N), history_df["val_binary_accuracy"], label="valid acc", color='red')
ax[1].set_title("Training and validation accuracy")
ax[1].set_xlabel("# of epoch")
ax[1].set_ylabel("Accuracy")
ax[1].legend()
fig.tight_layout(pad=3.0)
plt.show()
def train_model(model, model_name, train_gen, valid_gen, epochs=50,
monitor='val_loss', mode='min', patience=15):
"""
Function training the model with ModelCheckpoint and EarlyStopping,
saving training history in form of dataframe to .pkl file and plotting the history.
Args:
model (Model): model to be saved.
model_name (str): name under which the training history will be saved.
train_gen (DataFrameIterator): iterator over training set.
valid_gen (DataFrameIterator): iterator over validation set.
epochs (int): Number of epochs to train the model.
monitor (str): metrics to be monitored.
mode (one of {'auto', 'min', 'max'}): the decision to overwrite the current save file
is made based on either the maximization or the minimization
of the monitored quantity.
patience (int): number of training epochs without model improvement before quitting.
Returns:
history (model.history): history of model training
"""
# set callbacks
checkpoint = ModelCheckpoint(str(model_name + '.best.hdf5'),
monitor = monitor,
verbose = 1,
save_best_only = True,
mode = mode,
save_weights_only = True)
early = EarlyStopping(monitor = monitor,
mode = mode,
patience=patience)
callbacks_list = [checkpoint, early]
# get fit_generator
history = model.fit_generator(train_gen,validation_data = valid_gen,
epochs = epochs,
callbacks = callbacks_list)
# create dataframe containing training history and save it to .pkl
history_df = pd.DataFrame(history.history)
filename = str(model_name + '_history.pkl')
history_df.to_pickle(filename)
# plot history
plot_history(history_df)
return history
def plot_prediction_distribution(pred_array):
"""
Function plotting the distribution of predictions.
Args:
pred_array (numpy array): array containing predictions
"""
plt.figure(figsize=(6,4))
plt.hist(pred_array, bins=50)
plt.title('Distribution of model predictions')
plt.xlabel('Prediction: probability of pneumonia')
plt.ylabel('# number of scans')
plt.show()
print(f'Model prediction min: {pred_array.min():.3}')
print(f'Model prediction max: {pred_array.max():.3}')
def plot_AUROC(GT_array, pred_array):
"""
Function plotting ROC curve and displaying AUC.
Args:
GT_array (numpy array): arry containing ground truth.
pred_array (numpy array): arry containing predictions.
Returns:
fpr (numpy array): array containing false postive rates for different treshold values.
tpr (numpy array): array containing true postive rates for different treshold values.
tresholds (numpy array): array containing treshold values used for generation of the ROC curve.
"""
plt.figure(figsize=(6,4))
fpr, tpr, thresholds = roc_curve(GT_array, pred_array)
plt.plot(fpr, tpr, label = '%s (AUC: %0.2f)' % ('Pneumonia', auc(fpr, tpr)))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black',label='Random choice')
plt.legend()
plt.show()
return fpr, tpr, thresholds
def plot_precision_recall_curve(GT_array, pred_array):
"""
Function plotting precision-recall curve and displaying average precision score.
Args:
GT_array (numpy array): arry containing ground truth.
pred_array (numpy array): arry containing predictions.
Returns:
precision (numpy array): array containing precision values for different treshold values.
recall (numpy array): array containing recall for different treshold values.
tresholds (numpy array): array containing treshold values used for generation of the precision-recall curve.
"""
plt.figure(figsize=(6,4))
precision, recall, thresholds = precision_recall_curve(GT_array, pred_array)
plt.plot(recall, precision, label = '%s (AP Score:%0.2f)' % ('Pneumonia', average_precision_score(GT_array,pred_array)))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()
return precision, recall, thresholds
def calc_f1(prec,recall):
"""
Function calculating F1 score.
Args:
prec (float): precision value.
recall (float): recall value.
Returns:
2*(prec*recall)/(prec+recall) (float): F1-score
"""
return 2*(prec*recall)/(prec+recall)
def plot_f1_tresh(GT_array, pred_array):
"""
Function plotting f1 score vs treshold.
Args:
GT_array (numpy array): arry containing ground truth.
pred_array (numpy array): arry containing predictions.
Returns:
f1 (numpy array): array containing f1 score for different treshold values.
tresholds (numpy array): array containing treshold values used for generation of the precision-recall curve.
"""
precision, recall, thresholds = precision_recall_curve(GT_array, pred_array)
f1 = calc_f1(precision, recall)
plt.figure(figsize=(6,4))
plt.plot(f1[:-1], thresholds)
plt.xlabel('F1 score')
plt.ylabel('Treshold')
plt.show()
return f1, thresholds
def make_evaluation_df(GT_array, pred_array, treshold):
"""
Function returing dataframe containing ground truth, model prediction and class based on chosen treshold.
Args:
GT_array (numpy array): arry containing ground truth.
pred_array (numpy array): arry containing predictions.
treshold(float): value of chosen treshold
Returns:
evaluation_df (dataframe): dataframe containing ground truth, model prediction and class based on chosen treshold.
"""
evaluation_df = pd.DataFrame(GT_array, columns=['Ground_truth'])
evaluation_df['Pred'] = pred_array
evaluation_df['Pred_tresh'] = evaluation_df['Pred'] >= treshold
evaluation_df['Pred_tresh'] = evaluation_df['Pred_tresh'].replace(True, 1).replace(False, 0)
display(evaluation_df.head())
return evaluation_df
def predict_and_evaluate_model(model, model_name, test_gen, steps):
"""
Function predicting labels, plotting prediction distribution,
AUROC curve and precision recall curve.
Function also saves model predictions and ground truth in as .npy files.
Args:
model (Model): model, which will be used for generation of predictions.
model_name (str): prefix under which the prediction array and the ground truth array will be saved.
test_gen (DataFrameIterator): iterator over test set.
steps (int): Total number of steps (batches of samples) before declaring the prediction round finished.
Returns:
GT_array (numpy array): arry containing ground truth.
pred_array (numpy array): arry containing predictions.
evaluation_dic (dict): dictonary containing values from AUROC and precision-recall curves.
"""
# generate predictions
pred_Y = model.predict_generator(test_gen, steps=steps, verbose = True)
# get ground truth labels
ground_truth = test_gen.labels
# save prediction and ground truth as .npy files.
np.save(model_name + '_pred_Y.npy', pred_Y)
np.save(model_name + '_GT.numpy', test_gen.labels)
# plot prediction distribution, AUROC and precision-recall curve
plot_prediction_distribution(pred_Y)
fpr, tpr, thresholds_auroc = plot_AUROC(ground_truth, pred_Y)
precision, recall, thresholds_pr = plot_precision_recall_curve(ground_truth, pred_Y)
# create distonary containing values from AUROC and precision-recall curves
evaluation_dic = {'fpr': fpr,
'tpr' : tpr,
'thresholds_auroc': thresholds_auroc,
'precision': precision,
'recall': recall,
'thresholds_pr': thresholds_pr}
return pred_Y, ground_truth, evaluation_dic
# Initializing training generator
train_idg = image_generator_train()
train_gen = get_generator_train(train_idg, train_df)
# Looking at some examples of our training data to understand the extent to which data is being augumented prior to training
train_x, train_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for (image, label, plot) in zip(train_x, train_y, m_axs.flatten()):
plot.imshow(image[:,:,0], cmap = 'bone')
if label == 1:
plot.set_title('Pneumonia')
else:
plot.set_title('No Pneumonia')
plot.axis('off')
# Accessing the order of the classes in training generator
train_gen.class_indices
# Initializing validation and test generators
valid_gen, test_gen = get_test_valid_generator()
# Model 1:
## transfer layer: conv5_block16_1_conv (idx 420)
## learning rate E-4
## dropout 0.2
model_1_pretrained = load_pretrained_model(pretrained_model=DenseNet121)
model_1 = build_model(model_1_pretrained, dropout=0.2)
save_model(model_1, 'model_1')
model_1_hist = train_model(model_1, 'model_1', train_gen, valid_gen, epochs=100)
pred_Y_1, ground_truth_1, evaluation_dic_1 = predict_and_evaluate_model(model_1, "model_1", test_gen, steps=len(test_df)/64)
In the next model the dropout will be increased to 0.5 to avoid overfitting.
# The same as model 1 with increased dropout
# Model 2:
## transfer layer: conv5_block16_1_conv (idx 420)
## learning rate E-4
## dropout 0.5
model_2_pretrained = load_pretrained_model(pretrained_model=DenseNet121)
model_2 = build_model(model_2_pretrained, dropout=0.5)
save_model(model_2, 'model_2')
model_2_hist = train_model(model_2, 'model_2', train_gen, valid_gen, epochs=100)
pred_Y_2, ground_truth_2, evaluation_dic_2 = predict_and_evaluate_model(model_2, "model_2", test_gen, steps=len(test_df)/64)
The next model will be based on model 1 with increased starting learnig rate, as smaller learning rate ranges increase the risk of stucking in local minimum resulting in lack of improvement in validation accuracy.
# The same as model 1 with increased learning rate
# Model 3:
## transfer layer: conv5_block16_1_conv (idx 420)
## learning rate E-3
## dropout 0.2
model_3_pretrained = load_pretrained_model(pretrained_model=DenseNet121)
model_3 = build_model(model_3_pretrained, lr=0.01, dropout=0.2)
save_model(model_3, 'model_3')
model_3_hist = train_model(model_3, 'model_3', train_gen, valid_gen, epochs=100)
pred_Y_3, ground_truth_3, evaluation_dic_3 = predict_and_evaluate_model(model_3, "model_3", test_gen, steps=len(test_df)/64)
The training and validation loss did not decrease, while the accuracies did not increase during training, meaning that the model did not learn. Those signals are very noisy, suggesting that the learning rate ranges are too high and thus overshooting minimum/local minima.
The AUC amounts to 0.5, a value equal to random choice.
The next model will be based on model 1 with more layers frozen to avoid overfitting.
# The same as model 1 with more layers frozen
# Model 4:
## transfer layer: avg_pool (idx 427)
## learning rate E-4
## dropout 0.2
model_4_pretrained = load_pretrained_model(pretrained_model=DenseNet121, transfer_layer='avg_pool',
transfer_layer_idx=427)
model_4 = build_model(model_4_pretrained, dropout=0.2)
save_model(model_4, 'model_4')
model_4_hist = train_model(model_4, 'model_4', train_gen, valid_gen, epochs=50)
pred_Y_4, ground_truth_4, evaluation_dic_4 = predict_and_evaluate_model(model_4, "model_4", test_gen, steps=len(test_df)/64)
In the next model only prediction layer will be attached to the pretrained model, to decrease its complexity and thus avoiding overfitting.
# The same as model 4 with only prediction layer attached to the pretrained model
# Model 5:
## transfer layer: avg_pool (idx 427)
## learning rate E-4
model_5_pretrained = load_pretrained_model(pretrained_model=DenseNet121, transfer_layer='avg_pool',
transfer_layer_idx=427)
model_5 = build_simpler_model(model_5_pretrained)
save_model(model_5, 'model_5')
model_5_hist = train_model(model_5, 'model_5', train_gen, valid_gen, epochs=100)
pred_Y_5, ground_truth_5, evaluation_dic_5 = predict_and_evaluate_model(model_5, "model_5", test_gen, steps=len(test_df)/64)
The starting learning rate will be increased in the next model.
# The same as model 5 with increased learning rate
# Model 5:
## transfer layer: avg_pool (idx 427)
## learning rate E-3
model_6_pretrained = load_pretrained_model(pretrained_model=DenseNet121, transfer_layer='avg_pool',
transfer_layer_idx=427)
model_6 = build_simpler_model(model_6_pretrained, lr=0.001)
save_model(model_6, 'model_6')
model_6_hist = train_model(model_6, 'model_6', train_gen, valid_gen, epochs=60)
pred_Y_6, ground_truth_6, evaluation_dic_6 = predict_and_evaluate_model(model_6, "model_6", test_gen, steps=len(test_df)/64)
model_7_pretrained = load_pretrained_model(pretrained_model=DenseNet121, transfer_layer='conv5_block1_1_conv',
transfer_layer_idx=428)
model_7 = build_model(model_7_pretrained, dropout=0.2)
save_model(model_7, 'model_7')
model_7_hist = train_model(model_7, 'model_7', train_gen, valid_gen, epochs=100)
pred_Y_7, ground_truth_7, evaluation_dic_7 = predict_and_evaluate_model(model_7, "model_7", test_gen, steps=len(test_df)/64)
data = {'Model': ['Model_1', 'Model_2', 'Model_3', 'Model_4', 'Model_5', 'Model_6', 'Model_7'],
'Transfer layer': ['conv5_block16_1_conv', 'conv5_block16_1_conv', 'conv5_block16_1_conv',
'avg_pool', 'avg_pool', 'avg_pool', 'avg_pool'],
'Transfer layer idx': [420, 420, 420, 427, 427, 427, 427],
'Embedding model': ['standard', 'standard', 'standard', 'standard', 'simpler', 'simpler', 'standard'],
'Starting learning rate' : ['E-4', 'E-4', 'E-3', 'E-4', 'E-4', 'E-3', 'E-4'],
'Dropout': [0.2, 0.5, 0.2, 0.2, 0.0, 0.0],
'Best Epoch /Epochs set)' : ['5/100', '5/100', '14/100', '27/50', '100/100', '39/60', '.'],
'Prediction distribution': ['0.08 - 0.93', '0.16 - 0.87', '0.37 - 0.63', '0.07 - 0.93', '0.42 - 0.59', '0.30 - 0.74',
'.'],
'Prediction distribution: peaks': ['Two (wrong ratio)', 'Single', 'Single',
'Two (ratio closer to 80/20)', 'Single', 'Single','.'],
'AUC' : [0.61, 0.56, 0.50, 0.57, 0.64, 0.65, 0],
'AP Score' : [0.28, 0.23, 0.20, 0.24, 0.31, 0.29,0],
'Comment' : ['Highly overfitting after first epochs', 'Highly overfitting', 'Underfitting',
'Overfitting', 'Slow learning', 'No learning', '.']}
models_summary = pd.DataFrame(data=data)
models_summary
# Loading the numpy arrays contaning ground truth and predictions
pred_Y_4 = np.load('model_4_pred_Y.npy')
pred_Y_GT = np.load('model_4_GT.numpy.npy')
# Calculating of false positive rate, true positive rate, precision, recall for different treshold values
fpr_4, tpr_4, thresholds_auroc_4 = roc_curve(pred_Y_GT, pred_Y_4)
precision_4, recall_4, thresholds_pr_4 = precision_recall_curve(pred_Y_GT, pred_Y_4)
# Calculating treshold for different F1 score values
f1_4, thresholds_pr_4 = plot_f1_tresh(pred_Y_GT, pred_Y_4)
# Checking values at the end of the f1 score numpy array
f1_4[-10:]
# Chosing treshold maximizing F1 score
f1_4_max = np.max(f1_4[:-3]) #omitting NaN values
f1_4_max
# Extracting precision and recall for the chosen treshold
idx = np.where(f1_4[:-3] == f1_4[:-3].max())
tresh_4_best = thresholds_pr_4[idx]
print(f'Treshold maximizing f1 score: {tresh_4_best[0]:.3}')
print(f'For this treshold precision equals to: {precision_4[idx][0]:.3}')
print(f'For this treshold recall equals to: {recall_4[idx][0]:.3}')
# Classification of the scan on the basis of chosen treshold
evaluation_df_4 = make_evaluation_df(pred_Y_GT, pred_Y_4, tresh_4_best[0])
# Confusion matrix
confusion_matrix_4 = confusion_matrix(evaluation_df_4['Ground_truth'], evaluation_df_4['Pred_tresh'])
confusion_matrix_4
# Calculating specificity
specificity_4 = confusion_matrix_4[0,0] / (confusion_matrix_4[0,0] + confusion_matrix_4[0,1])
print(f'For this treshold specificity equals to: {specificity_4:.3}')
# Displaying images from the test set with first number corresponding to the ground truth
# and the second to the model prediction
test_x, test_y = next(test_gen)
fig, m_axs = plt.subplots(8, 8, figsize = (20, 20))
i = 0
for (image, label, plot) in zip(test_x, test_y, m_axs.flatten()):
plot.imshow(image[:,:,0], cmap = 'bone')
if label == 1:
if pred_Y_4[i] >= tresh_4_best[0]:
plot.set_title('1, 1')
else:
plot.set_title('1, 0')
else:
if pred_Y_4[i] >= tresh_4_best[0]:
plot.set_title('0, 1')
else:
plot.set_title('0, 0')
plot.axis('off')
i += 1
The chosen model 4 has maximal F1 score amounting to 0.353, which is comparable to Radiologist 2 from Ref [1], who had f1 score 0.356. However, the model has too many false positives, thus it should be further tuned to achieve better performance. Further hyperparameter tuning of model 4 could be performed, with following steps:
[1] Rajpurkar, Pranav, Jeremy Irvin, Kaylie Zhu, Brandon Yang, Hershel Mehta, Tony Duan, Daisy Ding, et al. “CheXNet: Radiologist-Level Pneumonia Detection on Chest X-Rays with Deep Learning.” ArXiv:1711.05225 [Cs, Stat], December 25, 2017